#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Requires: gensim, jieba

from utils import *

# Tokens removed from every segmented sentence: the space character and
# common full-width Chinese punctuation marks. Each mark must be its own
# element; adjacent string literals without a comma are concatenated into a
# single (useless) two-character entry.
STOPWORDS = {' ', '：', '。', '，', '、', '？', '；'}

def get_corpus(index='3900674'):
    """Download one liuxue86.com article and return it as tokenised items.

    The article body is taken from the ``div`` with class ``main_zhengw``.
    Paragraphs before the first one whose text starts with a digit are
    ignored. From there on, a paragraph starting with a digit opens a new
    item (leading digits, dots, spaces and non-breaking spaces are stripped)
    and any other paragraph is appended to the current item.

    Args:
        index: Article id substituted into the page URL.

    Returns:
        A list with one entry per numbered item; each entry is the list of
        tokens produced by ``jieba.cut`` minus those in ``STOPWORDS``.
        Empty if the article contains no numbered paragraph.

    Raises:
        ValueError: If the page has no ``div.main_zhengw`` element.
    """
    url = "https://www.liuxue86.com/a/%s.html" % index

    soup = url2soup(url)
    body = soup.find('div', {'class': 'main_zhengw'})
    if body is None:
        raise ValueError("no 'main_zhengw' article body found at %s" % url)

    sentences = []
    for paragraph in body.find_all('p'):
        text = paragraph.text.strip()
        # Slice instead of indexing so that empty paragraphs do not raise.
        if text[:1].isdigit():
            sentences.append(text.lstrip('0123456789. \xa0'))
        elif sentences:
            # Continuation of the current numbered item. Paragraphs that
            # appear before the first numbered one are skipped because
            # `sentences` is still empty at that point.
            sentences[-1] += text

    import jieba

    return [
        [word for word in jieba.cut(sentence) if word not in STOPWORDS]
        for sentence in sentences
    ]


def topic(corpus, num_topics=5, *args, **kwargs):
    """Fit an LDA model on a tokenised corpus and print every topic.

    Args:
        corpus: List of documents, each a list of string tokens.
        num_topics: Number of topics to fit; the same number is printed.
        *args: Extra positional arguments for ``LdaModel``; they continue
            its parameter list after ``corpus, num_topics, id2word``.
        **kwargs: Extra keyword arguments forwarded to ``LdaModel``.
    """
    import gensim
    id2word = gensim.corpora.Dictionary(corpus)
    bow_corpus = [id2word.doc2bow(text) for text in corpus]
    # The three fixed arguments are passed positionally so that *args can
    # follow them; mixing *args with ``corpus=...`` as a keyword made any
    # extra positional argument collide with ``corpus``.
    lda = gensim.models.ldamodel.LdaModel(
        bow_corpus, num_topics, id2word, *args, **kwargs
    )
    print('============== TOPICS ================')
    # Ask for as many topics as were fitted rather than a hard-coded 5.
    for k, t in lda.print_topics(num_topics, num_words=10):
        print('TOPIC %d:' % k, t)

# from gensim.models import word2vec

# try:
#     model=word2vec.Word2Vec.load('mymodel')
# except:
#     corpus = get_corpus()
#     model = word2vec.Word2Vec(corpus, min_count=5, workers=5, size=100, window=5)
#     model.save('mymodel')

# Script body: fetch the default article, then fit and print its LDA topics.
# This runs at import time as well as when the file is executed directly.
corpus = get_corpus('3900674')
topic(corpus)

